1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data Source https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6960825/

Mohino-Herranz I, Gil-Pita R, Rosa-Zurera M, Seoane F. Activity Recognition Using Wearable Physiological Measurements: Selection of Features from a Comprehensive Literature Study. Sensors (Basel). 2019 Dec 13;19(24):0. doi: 10.3390/s19245524. PMID: 31847261; PMCID: PMC6960825.

1.2 The Data

Activitydata <- read.csv("~/GitHub/LatentBiomarkers/Data/ActivityData/data.txt", header=FALSE, stringsAsFactors=TRUE)
featNames <- read.table("~/GitHub/LatentBiomarkers/Data/ActivityData/Featurelabels.txt", quote="\"", comment.char="")
featNames <- as.character(t(featNames))
featNames <- str_replace_all(featNames,"\\(abs\\)","_A_")
featNames[duplicated(featNames)] <- paste(featNames[duplicated(featNames)],"D",sep="_")

rep_ID <- numeric(nrow(Activitydata))
ctr <- 1
for (ind in c(1:(nrow(Activitydata)-1)))
{
  rep_ID[ind] <- ctr
  if (Activitydata$V1[ind] != Activitydata$V1[ind+1]) ctr <- 0;
  ctr <- ctr + 1
}
rownames(Activitydata) <- paste(Activitydata$V1,rep_ID,sep="_")
colnames(Activitydata) <- c("ID",featNames,"class")
Activitydata$rep <- rep_ID
  
tb <- table(Activitydata$class)

classes <- c("Neu","Emo","Men","Phy")
names(classes) <- names(tb)
Activitydata$class <- classes[as.character(Activitydata$class)]
table(Activitydata$class)
#> 
#>  Emo  Men  Neu  Phy 
#> 1120 1120 1120 1120



ID_class <- paste(Activitydata$ID,Activitydata$class)
IDCLASS <- unique(ID_class)
theclass <- Activitydata$class[!duplicated(ID_class)]
theIDs <- Activitydata$ID[!duplicated(ID_class)]

ActivitydataAvg <- NULL
for (id in IDCLASS)
{
  ActivitydataAvg <- rbind(ActivitydataAvg,apply(Activitydata[ID_class==id,featNames],2,mean))
}
colnames(ActivitydataAvg) <- featNames
rownames(ActivitydataAvg) <- IDCLASS
ActivitydataAvg <- as.data.frame(ActivitydataAvg)
ActivitydataAvg$class <- theclass
ActivitydataAvg$ID <- theIDs

table(ActivitydataAvg$class)
#> 
#> Emo Men Neu Phy 
#>  40  40  40  40


ActivitydataAvg <- subset(ActivitydataAvg, class=="Men" | class=="Emo")

ActivitydataAvg$class <- 1*(ActivitydataAvg$class == "Men")
table(ActivitydataAvg$class)
#> 
#>  0  1 
#> 40 40

1.2.0.1 Standarize the names for the reporting

studyName <- "Activity"
dataframe <- ActivitydataAvg
outcome <- "class"

TopVariables <- 10
thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
80 534
pander::pander(table(dataframe[,outcome]))
0 1
40 40

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 1

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 362 , Uni p: 0.0184537 , Uncorrelated Base: 33 , Outcome-Driven Size: 0 , Base Size: 33 
#> 
#> 
 1 <R=1.000,r=0.975,N=  255>, Top: 43( 1 )[ 1 : 43 Fa= 42 : 0.975 ]( 42 , 169 , 0 ),<|>Tot Used: 211 , Added: 169 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,r=0.975,N=  255>, Top: 32( 1 )[ 1 : 32 Fa= 70 : 0.975 ]( 32 , 95 , 42 ),<|>Tot Used: 242 , Added: 95 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,r=0.975,N=  255>, Top: 14( 2 )[ 1 : 14 Fa= 82 : 0.975 ]( 14 , 28 , 70 ),<|>Tot Used: 246 , Added: 28 , Zero Std: 0 , Max Cor: 1.000
#> 
 4 <R=1.000,r=0.975,N=  255>, Top: 8( 8 )[ 1 : 8 Fa= 89 : 0.975 ]( 8 , 23 , 82 ),<|>Tot Used: 246 , Added: 23 , Zero Std: 0 , Max Cor: 1.000
#> 
 5 <R=1.000,r=0.950,N=  115>, Top: 41( 4 )[ 1 : 41 Fa= 101 : 0.950 ]( 38 , 62 , 89 ),<|>Tot Used: 274 , Added: 62 , Zero Std: 0 , Max Cor: 1.000
#> 
 6 <R=1.000,r=0.950,N=  115>, Top: 14( 1 )[ 1 : 14 Fa= 109 : 0.950 ]( 14 , 18 , 101 ),<|>Tot Used: 274 , Added: 18 , Zero Std: 0 , Max Cor: 1.000
#> 
 7 <R=1.000,r=0.950,N=  115>, Top: 3( 4 )[ 1 : 3 Fa= 110 : 0.950 ]( 3 , 6 , 109 ),<|>Tot Used: 274 , Added: 6 , Zero Std: 0 , Max Cor: 1.000
#> 
 8 <R=1.000,r=0.900,N=  134>, Top: 48( 4 )[ 1 : 48 Fa= 117 : 0.900 ]( 44 , 69 , 110 ),<|>Tot Used: 295 , Added: 69 , Zero Std: 0 , Max Cor: 1.000
#> 
 9 <R=1.000,r=0.900,N=  134>, Top: 11( 4 )[ 1 : 11 Fa= 120 : 0.900 ]( 9 , 13 , 117 ),<|>Tot Used: 299 , Added: 13 , Zero Std: 0 , Max Cor: 1.000
#> 
 10 <R=1.000,r=0.850,N=  109>, Top: 45( 3 )[ 1 : 45 Fa= 138 : 0.860 ]( 44 , 59 , 120 ),<|>Tot Used: 321 , Added: 59 , Zero Std: 0 , Max Cor: 0.995
#> 
 11 <R=0.995,r=0.847,N=  109>, Top: 8( 2 )[ 1 : 8 Fa= 140 : 0.847 ]( 8 , 9 , 138 ),<|>Tot Used: 321 , Added: 9 , Zero Std: 0 , Max Cor: 0.961
#> 
 12 <R=0.961,r=0.800,N=   99>, Top: 40( 2 )[ 1 : 40 Fa= 145 : 0.800 ]( 39 , 55 , 140 ),<|>Tot Used: 329 , Added: 55 , Zero Std: 0 , Max Cor: 0.982
#> 
 13 <R=0.982,r=0.800,N=   99>, Top: 15( 1 )[ 1 : 15 Fa= 148 : 0.800 ]( 15 , 20 , 145 ),<|>Tot Used: 330 , Added: 20 , Zero Std: 0 , Max Cor: 0.999
#> 
 14 <R=0.999,r=0.800,N=   99>, Top: 6( 1 )[ 1 : 6 Fa= 151 : 0.800 ]( 6 , 6 , 148 ),<|>Tot Used: 330 , Added: 6 , Zero Std: 0 , Max Cor: 0.966
#> 
 15 <R=0.966,r=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 152 : 0.800 ]( 1 , 1 , 151 ),<|>Tot Used: 330 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#> 
 16 <R=0.800,r=0.800,N=    0>
#> 
 [ 16 ], 0.8965018 Decor Dimension: 330 Nused: 330 . Cor to Base: 197 , ABase: 16 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

2.43e+21

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

5.08e+14

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

0.0195

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

0.0691

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 1

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : ECG_p_LF_mean 200 : IT_CCV_LF 300 : EDA_Original_mad_D




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_ECG_p_LF_mean 200 : La_IT_CCV_LF 300 : La_EDA_Original_mad_D

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
ECG_hrv_prctile75 -2.719 8.9741 -5.8408 10.9387 0.00378 0.731
ECG_hrv_geomean_A_ 10.370 8.0477 13.8743 7.8682 0.00822 0.727
IT_LF_baseline_D 43.569 25.1600 26.4345 15.3449 0.61672 0.721
IT_p_Total_baseline 51.974 29.4230 31.5389 17.9800 0.71076 0.721
IT_VLF_baseline 57.578 32.3488 34.9418 19.7865 0.75500 0.720
ECG_hrv_prctile25 -12.471 7.7963 -16.6349 6.7770 0.20654 0.719
IT_PSD_baseline 0.059 0.0376 0.0358 0.0228 0.21000 0.715
ECG_hrv_mean -7.294 5.4920 -11.0369 6.0390 0.33833 0.714
IT_HF_baseline 3.308 3.5341 2.0033 2.1158 0.00564 0.713
ECG_hrv_trimmean25 -7.619 6.1613 -11.3761 6.3193 0.35724 0.711


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
La_ECG_RR_window_baseline 19.766 11.454 10.652 10.519 0.62646 0.776
La_EDA_Original_mad_D -7.578 10.562 1.170 9.564 0.01215 0.776
La_ECG_HR_min_div_std 0.533 0.674 1.130 0.819 0.22297 0.762
La_IT_BRV_baseline -3.930 2.265 -2.114 1.544 0.80056 0.756
La_EDA_Original_baseline_D -558.607 1630.312 714.724 1166.346 0.03382 0.746
La_ECG_RR_window_harmmean 0.595 0.380 0.375 0.411 0.04531 0.734
La_EDA_Original_max_D 127.623 285.226 -44.023 189.364 0.00307 0.731
La_ECG_RR_window_mad 5.227 3.767 8.361 5.205 0.64092 0.722
La_EDA_processed_trimmean25_D 0.999 2.270 -0.379 1.094 0.15207 0.721
La_EDA_Original_std_D -15.390 75.369 74.185 136.981 0.01610 0.719

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
3.33 301 0.825


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
La_ECG_RR_window_baseline -0.927ECG_RR_window_mean + 1.000ECG_RR_window_baseline 19.766 11.454 10.652 10.519 0.626465 0.776 0.522 -1
La_EDA_Original_mad_D -0.831EDA_Original_std_D + 1.000EDA_Original_mad_D -7.578 10.562 1.170 9.564 0.012150 0.776 0.507 0
La_ECG_HR_min_div_std + 1.000ECG_HR_min_div_std -2.612ECG_hrv_std + 1.827*ECG_hrv_mad 0.533 0.674 1.130 0.819 0.222968 0.762 0.579 -2
La_IT_BRV_baseline -0.588IT_BRV_mean + 1.000IT_BRV_baseline -3.930 2.265 -2.114 1.544 0.800556 0.756 0.642 -1
La_EDA_Original_baseline_D -0.895EDA_Original_mean_D + 1.000EDA_Original_baseline_D -558.607 1630.312 714.724 1166.346 0.033817 0.746 0.504 -1
La_ECG_RR_window_harmmean + 0.715ECG_RR_window_mean -1.710ECG_RR_window_geomean_A_ + 1.000ECG_RR_window_harmmean + 0.702ECG_hrv_mean -0.664*ECG_hrv_trimmean25 0.595 0.380 0.375 0.411 0.045307 0.734 0.603 -3
La_EDA_Original_max_D -4.171EDA_Original_mean_D + 1.000EDA_Original_max_D + 3.174*EDA_Original_prctile25_D 127.623 285.226 -44.023 189.364 0.003074 0.731 0.575 -2
La_ECG_RR_window_mad + 1.000ECG_RR_window_mad -1.741ECG_hrv_mad 5.227 3.767 8.361 5.205 0.640925 0.722 0.637 2
La_EDA_processed_trimmean25_D + 1.000EDA_processed_trimmean25_D -0.652EDA_processed_median_D 0.999 2.270 -0.379 1.094 0.152073 0.721 0.582 -1
La_EDA_Original_std_D + 1.000EDA_Original_std_D -2.861EDA_processed_std_D -15.390 75.369 74.185 136.981 0.016099 0.719 0.510 3
ECG_hrv_mean NA -7.294 5.492 -11.037 6.039 0.338332 0.714 0.714 5
ECG_hrv_trimmean25 NA -7.619 6.161 -11.376 6.319 0.357244 0.711 0.711 NA
IT_BRV_baseline NA -2.921 4.580 -1.086 5.893 0.082008 0.642 0.642 NA
ECG_RR_window_mad NA 15.323 11.931 19.343 15.318 0.016755 0.637 0.637 NA
ECG_RR_window_mean NA 213.082 36.977 225.302 36.955 0.959974 0.608 0.608 10
ECG_RR_window_geomean_A_ NA 211.244 38.262 222.696 38.877 0.944821 0.603 0.603 NA
ECG_RR_window_harmmean NA 209.548 39.257 220.307 40.355 0.933015 0.603 0.603 NA
EDA_processed_trimmean25_D NA -2.813 7.265 -3.429 6.705 0.007616 0.582 0.582 NA
EDA_processed_std_D NA 90.946 122.230 58.264 59.707 0.014159 0.581 0.581 NA
ECG_HR_min_div_std NA 8.572 9.261 9.524 10.152 0.000285 0.579 0.579 NA
EDA_Original_max_D NA 6539.842 7764.396 4983.886 6585.632 0.003534 0.575 0.575 NA
EDA_Original_prctile25_D NA 5745.512 7290.266 4265.562 6006.293 0.006278 0.573 0.573 NA
EDA_Original_mean_D NA 5909.653 7389.707 4451.522 6159.561 0.005912 0.569 0.569 18
EDA_processed_median_D NA -5.842 11.119 -4.675 9.408 0.007767 0.530 0.530 NA
ECG_RR_window_baseline NA 217.376 37.213 219.594 36.241 0.705034 0.522 0.522 NA
ECG_hrv_std NA 7.132 8.339 7.624 9.462 0.000275 0.514 0.514 NA
ECG_hrv_mad NA 5.799 6.907 6.308 8.000 0.000175 0.514 0.514 40
EDA_Original_std_D NA 244.767 321.426 240.854 284.853 0.015456 0.510 0.510 NA
EDA_Original_mad_D NA 195.764 262.605 201.261 242.021 0.012852 0.507 0.507 NA
EDA_Original_baseline_D NA 4732.181 6363.016 4700.077 6278.106 0.003312 0.504 0.504 NA
IT_BRV_mean NA 1.717 8.347 1.749 8.360 0.270100 0.503 0.503 21

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 38 2
1 7 33
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.4375 0.32676 0.553
    tp 0.5000 0.38605 0.614
    se 0.8250 0.67221 0.927
    sp 0.9500 0.83080 0.994
    diag.ac 0.8875 0.79715 0.947
    diag.or 89.5714 17.38881 461.391
    nndx 1.2903 1.08636 1.988
    youden 0.7750 0.50301 0.921
    pv.pos 0.9429 0.80843 0.993
    pv.neg 0.8444 0.70545 0.935
    lr.pos 16.5000 4.24197 64.180
    lr.neg 0.1842 0.09364 0.362
    p.rout 0.5625 0.44700 0.673
    p.rin 0.4375 0.32676 0.553
    p.tpdn 0.0500 0.00611 0.169
    p.tndp 0.1750 0.07338 0.328
    p.dntp 0.0571 0.00700 0.192
    p.dptn 0.1556 0.06491 0.295
  • tab:

      Outcome + Outcome - Total
    Test + 33 2 35
    Test - 7 38 45
    Total 40 40 80
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.887 0.797 0.947
3 se 0.825 0.672 0.927
4 sp 0.950 0.831 0.994
6 diag.or 89.571 17.389 461.391

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 34 6
1 3 37
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.5375 0.4224 0.650
    tp 0.5000 0.3860 0.614
    se 0.9250 0.7961 0.984
    sp 0.8500 0.7016 0.943
    diag.ac 0.8875 0.7972 0.947
    diag.or 69.8889 16.1978 301.551
    nndx 1.2903 1.0786 2.009
    youden 0.7750 0.4978 0.927
    pv.pos 0.8605 0.7207 0.947
    pv.neg 0.9189 0.7809 0.983
    lr.pos 6.1667 2.9335 12.963
    lr.neg 0.0882 0.0295 0.264
    p.rout 0.4625 0.3503 0.578
    p.rin 0.5375 0.4224 0.650
    p.tpdn 0.1500 0.0571 0.298
    p.tndp 0.0750 0.0157 0.204
    p.dntp 0.1395 0.0530 0.279
    p.dptn 0.0811 0.0170 0.219
  • tab:

      Outcome + Outcome - Total
    Test + 37 6 43
    Test - 3 34 37
    Total 40 40 80
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.887 0.797 0.947
3 se 0.925 0.796 0.984
4 sp 0.850 0.702 0.943
6 diag.or 69.889 16.198 301.551

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 22 18
1 7 33
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.637 0.5224 0.742
    tp 0.500 0.3860 0.614
    se 0.825 0.6722 0.927
    sp 0.550 0.3849 0.707
    diag.ac 0.688 0.5741 0.787
    diag.or 5.762 2.0647 16.079
    nndx 2.667 1.5772 17.508
    youden 0.375 0.0571 0.634
    pv.pos 0.647 0.5007 0.776
    pv.neg 0.759 0.5646 0.897
    lr.pos 1.833 1.2649 2.657
    lr.neg 0.318 0.1535 0.660
    p.rout 0.362 0.2579 0.478
    p.rin 0.637 0.5224 0.742
    p.tpdn 0.450 0.2926 0.615
    p.tndp 0.175 0.0734 0.328
    p.dntp 0.353 0.2243 0.499
    p.dptn 0.241 0.1030 0.435
  • tab:

      Outcome + Outcome - Total
    Test + 33 18 51
    Test - 7 22 29
    Total 40 40 80
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.688 0.574 0.787
3 se 0.825 0.672 0.927
4 sp 0.550 0.385 0.707
6 diag.or 5.762 2.065 16.079


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 18 22
1 6 34
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.700 5.87e-01 0.797
    tp 0.500 3.86e-01 0.614
    se 0.850 7.02e-01 0.943
    sp 0.450 2.93e-01 0.615
    diag.ac 0.650 5.35e-01 0.753
    diag.or 4.636 1.59e+00 13.494
    nndx 3.333 -1.73e+02 1.792
    youden 0.300 -5.76e-03 0.558
    pv.pos 0.607 4.68e-01 0.735
    pv.neg 0.750 5.33e-01 0.902
    lr.pos 1.545 1.13e+00 2.105
    lr.neg 0.333 1.48e-01 0.752
    p.rout 0.300 2.03e-01 0.413
    p.rin 0.700 5.87e-01 0.797
    p.tpdn 0.550 3.85e-01 0.707
    p.tndp 0.150 5.71e-02 0.298
    p.dntp 0.393 2.65e-01 0.532
    p.dptn 0.250 9.77e-02 0.467
  • tab:

      Outcome + Outcome - Total
    Test + 34 22 56
    Test - 6 18 24
    Total 40 40 80
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.65 0.535 0.753
3 se 0.85 0.702 0.943
4 sp 0.45 0.293 0.615
6 diag.or 4.64 1.593 13.494
  par(op)